Load the dataset we want.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(janitor)
library(ggridges)
library(ggthemes)
library(stringr)
library(dplyr)
library(forcats)
#embedding plots in rmarkdown
knitr::opts_chunk$set(fig.width=12, fig.height=8, out.width = "80%")
theme_set(theme_bw())
First we import the data and clean it.
health =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'HEALTH') %>%
clean_names() %>%
select(1:7)
socioeconomic =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'SOCIOECONOMIC') %>%
clean_names() %>%
select(1:3, 10:18)
assistance =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'ASSISTANCE') %>%
clean_names() %>%
select(1:3, 23:29)
restaurant =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'RESTAURANTS') %>%
clean_names() %>%
select(1:9, 16:17)
county =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'Supplemental Data - County') %>%
clean_names()
state =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'Supplemental Data - State') %>%
clean_names() %>%
select(1:2, 9:20, 27:40)
store =
readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'STORES') %>%
clean_names() %>%
select(1:27)
health_state = health %>%
group_by(state) %>%
summarise(pct_diabetes_adults08 = mean(pct_diabetes_adults08),
pct_diabetes_adults13 = mean(pct_diabetes_adults13),
pct_obese_adults08 = mean(pct_obese_adults08),
pct_obese_adults13 = mean(pct_obese_adults13))
socioeconomic_state = socioeconomic %>%
group_by(state) %>%
summarise(pct_65older10 = mean(pct_65older10),
pct_18younger10 = mean(pct_18younger10),
medhhinc15 = mean(medhhinc15),
povrate15 = mean(povrate15),
childpovrate15 = mean(childpovrate15),
perpov10 = mean(perpov10)/n(),
perchldpov10 = mean(perchldpov10)/n())
social_health_whole = merge(socioeconomic, health,by=c("fips", "state", "county"))
social_health = merge(socioeconomic_state, health_state,by=c("state"))
#normally distributed
hist(health$pct_obese_adults13)
hist(health$pct_diabetes_adults13)
# median income VS obesity
social_health_whole %>%
group_by(state) %>%
ggplot(aes(x = medhhinc15, y = pct_obese_adults13)) +
geom_point(aes(color = state, size = 1), alpha = .6) +
geom_smooth() +
labs(
x = "Median household income, 2015",
y = "Percentage of adult obesity, 2013 ") +
theme(text = element_text(size = 14),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# median income VS diabetes
social_health_whole %>%
group_by(state) %>%
ggplot(aes(x = medhhinc15, y = pct_diabetes_adults13)) +
geom_point(aes(color = state, size = 1), alpha = .6) +
geom_smooth() +
labs(
x = "Median household income, 2015",
y = "Percentage of adult diabetes, 2013 ") +
theme(text = element_text(size = 14),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Diabetes VS obesity
social_health_whole %>%
group_by(state) %>%
ggplot(aes(x = pct_obese_adults13, y = pct_diabetes_adults13)) +
geom_point(aes(color = state, size = 1), alpha = .6) +
geom_smooth() +
labs(
x = "Percentage of adult obesity, 2013",
y = "Percentage of adult diabetes, 2013 ") +
theme(text = element_text(size = 14),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
#prepare dataset for lunch program
lunch = state[-52,] %>%
gather(key = "year", value = "lunch_participants", national_school_lunch_program_participants_fy_2009:national_school_lunch_program_participants_fy_2015) %>%
mutate(year = str_replace(year, "national_school_lunch_program_participants_fy_", "")) %>%
select(statefips, state, year, lunch_participants)
lunch_mean = lunch %>%
group_by(state) %>%
mutate(mean_participants = mean(lunch_participants)) %>%
ungroup(state) %>%
mutate(state = fct_reorder(state, mean_participants))
#Barplot of Average Number of Participants in School Lunch Program (2011-2015)
ggplot(lunch_mean, aes(x = state, y = mean_participants)) +
geom_bar(stat = "identity", fill = "blue", alpha = .6) +
labs(title = "Average Number of Participants in School Lunch Program (2011-2015)",
x = "state",
y = "Average number of participants") +
theme(legend.position = "bottom",
text = element_text(size = 12),
axis.text.x = element_text(angle = 90, hjust = 1))
health_13 = health %>%
group_by(state) %>%
mutate(diabetes_13 = mean(pct_diabetes_adults13, na.rm = T),
obesites_13 = mean(pct_obese_adults13, na.rm = T)) %>%
mutate(statefips = str_sub(fips, 1, 2))%>%
filter(!duplicated(state)) %>%
ungroup(state) %>%
select(statefips, diabetes_13, obesites_13) %>%
left_join(state, by = "statefips") %>%
rename(lunch_2009 = "national_school_lunch_program_participants_fy_2009",
lunch_2011 = "national_school_lunch_program_participants_fy_2011",
lunch_2012 = "national_school_lunch_program_participants_fy_2012",
lunch_2013 = "national_school_lunch_program_participants_fy_2013") %>%
select(statefips, state, diabetes_13, obesites_13, lunch_2009, lunch_2011, lunch_2012, lunch_2013)
#linear regression: diabetes_13 v.s. lunch participants (2013)
summary(lm(diabetes_13 ~ log(lunch_2013), data = health_13))
##
## Call:
## lm(formula = diabetes_13 ~ log(lunch_2013), data = health_13)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2232 -1.2219 -0.0489 0.9121 4.5302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7692 3.2166 0.550 0.58479
## log(lunch_2013) 0.6963 0.2507 2.777 0.00775 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.885 on 49 degrees of freedom
## Multiple R-squared: 0.136, Adjusted R-squared: 0.1184
## F-statistic: 7.712 on 1 and 49 DF, p-value: 0.00775